#### ---- SCRIPT 07: The purpose of this script is to test the ideological balance of guests on the seven selected ---- ####
#### ----programmes using their Twitter/X ideal points. Two tests are conducted (Mann-Whitney U and Hartigan's Dip) ---- ####
#### ---- along with a descriptive raincloud plot. Additional checks are then conducted, asssessing how these results ---- ####
#### ---- vary based on whether misisng user ideal points are imputed using organisational proxies, with MPs removed ---- ####
#### ---- and when splitting the programme episodes into quarterly time intervals.  ---- ####
 
options(scipen=999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # big data wrangling
library(ggridges) # generate ridge plots
library(tibble) # data structuring
library(tidyr) # data tidying
library(diptest) # Hartigan's dip test
library(lubridate) # working with dates

#### ---- LOAD DATA ---- ####

# Twitter user data 
user_data <- fread("user_ideal_point_data_updated.csv")

# Guest masterlist
guest_data <- fread("username_master_list_with_ideal_points.csv")

# TV guest lists
guest_lists <- list.files("guest_lists/",full.names = TRUE) %>%
  lapply(fread)

#### ---- MERGE THE GUEST LISTS TOGETHER AND MATCH TO THEIR IDEAL POINTS ---- ####

# Concatenate the individual guest lists together into one dataset
guest_lists_merged <- rbindlist(guest_lists)

# Match each guest appearance in the dataset to their corresponding ideal point and the ideal point of their
# organisation and organisation affiliates. These will only be used where an individual does not have an ideal point
# of their own.
guest_lists_merged <- left_join(guest_lists_merged,guest_data,by=c("Name" = "name"))

##### ---- TV SHOW IDEOLOGICAL REPRESENTATION ANALYSIS ---- ####

# Where a user does not have an individual ideal point, replace with their organisation's ideal point or the ideal point of their
# organisation's mean ideal point of affiliates
guest_lists_merged$final_ideal_point <- ifelse(is.na(guest_lists_merged$user_ideal_point),
                                               ifelse(is.na(guest_lists_merged$organisation_ideal_point),
                                                      guest_lists_merged$affiliate_mean_ideal_point,
                                                      guest_lists_merged$organisation_ideal_point),guest_lists_merged$user_ideal_point)

# Add all ordinary and elite Twitter users to the dataset to allow for comparisons with these groups alongside the T.V show
# distributions
twitter_ideal_points <- user_data %>% 
  select(user_ideal_point,elite_account) %>%
  rename(Show = elite_account)

guest_lists_merged <- rbind(guest_lists_merged,twitter_ideal_points,fill=TRUE)

## -- Results Without Imputation -- ## 

# Order shows by data availability for benefit of plot interpretability
shows_ideal_point_plot_ordering <- guest_lists_merged %>% 
  group_by(Show) %>%
  summarise(median_ideal_point = median(user_ideal_point,na.rm=T)) %>%
  arrange(desc(median_ideal_point)) %>%
  pull(Show)

guest_lists_merged$Show <- factor(guest_lists_merged$Show , levels = shows_ideal_point_plot_ordering)

# Raincloud distribution plot

ideal_point_distributions_by_show <- ggplot(guest_lists_merged, aes(x = user_ideal_point, y = Show, fill = Show)) +
  geom_density_ridges(
    scale = 1, 
    alpha = 0.7,
    bandwidth = 0.03, 
    rel_min_height = 0.01) + 
  geom_boxplot(
    width = .15, 
    outlier.shape = NA, 
    alpha = 0.7
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 1, 0.25),
                     labels = c("Left", "0.25", "0.5", "0.75", "Right"),
                     limits = c(0, 1)) +
  labs(y = "", x = "Ideal Point") +
  theme(axis.title.y = element_blank(), 
        legend.position = "none") +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "black")

ggsave("ideal_point_by_show_raincloud_plot.png",
       ideal_point_distributions_by_show, 
       units="in", width=7, height=4, dpi=300,
       bg="white")

##### ---- STATISTICAL TESTS ---- ####

# Mann-Whitney U test (for signifcance between groups)
test_result <- pairwise.wilcox.test(guest_lists_merged$user_ideal_point,guest_lists_merged$Show,
                     p.adjust.method = "bonferroni")

# Generate a signifcance matrix between distributions
p_value_matrix <- test_result$p.value

p_value_matrix_signif <- ifelse(p_value_matrix > 0.05, NA, "Signif.")

# Print matrix 
print(p_value_matrix_signif)

# Hartigan's Dip Test (for multimodality)

x <- dip.test(guest_lists_merged$user_ideal_point,
              simulate.p.value = FALSE)

dip_test_stats <- data.frame()

# Iterate through the dip test results and store in readable data frame
for (show in shows_ideal_point_plot_ordering){
  
  show_data <- guest_lists_merged %>%
    filter(Show == show)
  
  dip_stats <- dip.test(show_data$user_ideal_point,
                        simulate.p.value = FALSE)
  
  show_dip_stats_df <- data.frame(show = show,
                                  count = nrow(show_data[!is.na(user_ideal_point),]),
                                  mean_ideal = mean(show_data$user_ideal_point,na.rm = T),
                                  median_ideal= median(show_data$user_ideal_point,na.rm = T),
                                  s.d = sd(show_data$user_ideal_point,na.rm = T),
                                  min = min(show_data$user_ideal_point,na.rm = T),
                                  max = max(show_data$user_ideal_point,na.rm = T),
                                  dip_stat = dip_stats$statistic,
                                  p_value = dip_stats$p.value)
  
  dip_test_stats <- rbind(dip_test_stats,show_dip_stats_df)
}  

print(dip_test_stats)

#### ---- ADDITIONAL CHECKS ---- ####

#### ---- Results With Imputation ---- #### 

# Order shows by data availability for benefit of plot interpretability
shows_ideal_point_plot_ordering <- guest_lists_merged %>% 
  group_by(Show) %>%
  summarise(median_ideal_point = median(final_ideal_point,na.rm=T)) %>%
  arrange(desc(median_ideal_point)) %>%
  pull(Show)

guest_lists_merged$Show <- factor(guest_lists_merged$Show , levels = shows_ideal_point_plot_ordering)

# Set final ideal points for the non-guest accounts to their user ideal point
guest_lists_merged$final_ideal_point <- ifelse(is.na(guest_lists_merged$final_ideal_point),
                                               guest_lists_merged$user_ideal_point,
                                               guest_lists_merged$final_ideal_point)

# Raincloud distribution plot
ideal_point_distributions_by_show <- ggplot(guest_lists_merged, aes(x = final_ideal_point, y = Show, fill = Show)) +
  geom_density_ridges(
    scale = 1, 
    alpha = 0.7,
    bandwidth = 0.03, 
    rel_min_height = 0.01) + 
  geom_boxplot(
    width = .15, 
    outlier.shape = NA, 
    alpha = 0.7
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 1, 0.25),
                     labels = c("Left", "0.25", "0.5", "0.75", "Right"),
                     limits = c(0, 1)) +
  labs(y = "", x = "Ideal Point") +
  theme(axis.title.y = element_blank(), 
        legend.position = "none") +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "black")

ggsave("ideal_point_by_show_raincloud_plot_with_imputation.png",
       ideal_point_distributions_by_show, 
       units="in", width=7, height=4, dpi=300,
       bg="white")

# Tests for statistical significance between groups

# Mann-Whitney U Test
test_result <- pairwise.wilcox.test(guest_lists_merged$final_ideal_point,guest_lists_merged$Show,
                                    p.adjust.method = "bonferroni")

p_value_matrix <- test_result$p.value

p_value_matrix_signif <- ifelse(p_value_matrix > 0.05, NA, "Signif.")

print(p_value_matrix_signif)

# Hartigan's Dip Test
x <- dip.test(guest_lists_merged$final_ideal_point,
              simulate.p.value = FALSE)

dip_test_stats <- data.frame()

for (show in shows_ideal_point_plot_ordering){
  
  show_data <- guest_lists_merged %>%
    filter(Show == show)
  
  dip_stats <- dip.test(show_data$final_ideal_point,
                        simulate.p.value = FALSE)
  
  show_dip_stats_df <- data.frame(show = show,
                                  count = nrow(show_data[!is.na(final_ideal_point),]),
                                  mean_ideal = mean(show_data$final_ideal_point,na.rm = T),
                                  median_ideal= median(show_data$final_ideal_point,na.rm = T),
                                  s.d = sd(show_data$final_ideal_point,na.rm = T),
                                  min = min(show_data$final_ideal_point,na.rm = T),
                                  max = max(show_data$final_ideal_point,na.rm = T),
                                  dip_stat = dip_stats$statistic,
                                  p_value = dip_stats$p.value)
  
  dip_test_stats <- rbind(dip_test_stats,show_dip_stats_df)
}  

# Analysis with no MPs #

# Filter out MPs
prop.table(table(guest_lists_merged$category_name_1)) # 46%

guest_lists_no_mps <- guest_lists_merged %>% 
  filter(guest_lists_merged$category_name_1 != "MP" | is.na(guest_lists_merged$category_name_1))

# Order shows by data availability for benefit of plot interpretability
shows_ideal_point_plot_ordering <- guest_lists_no_mps %>% 
  group_by(Show) %>%
  summarise(median_ideal_point = median(user_ideal_point,na.rm=T)) %>%
  arrange(desc(median_ideal_point)) %>%
  pull(Show)

guest_lists_no_mps$Show <- factor(guest_lists_no_mps$Show , levels = shows_ideal_point_plot_ordering)

# Raincloud distribution plot
ideal_point_distributions_by_show <- ggplot(guest_lists_no_mps, aes(x = user_ideal_point, y = Show, fill = Show)) +
  geom_density_ridges(
    scale = 1, 
    alpha = 0.7,
    bandwidth = 0.03, 
    rel_min_height = 0.01) + 
  geom_boxplot(
    width = .15, 
    outlier.shape = NA, 
    alpha = 0.7
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 1, 0.25),
                     labels = c("Left", "0.25", "0.5", "0.75", "Right"),
                     limits = c(0, 1)) +
  labs(y = "", x = "Ideal Point") +
  theme(axis.title.y = element_blank(), 
        legend.position = "none") +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "black")

ggsave("ideal_point_by_show_raincloud_plot_without_mps.png",
       ideal_point_distributions_by_show, 
       units="in", width=7, height=4, dpi=300,
       bg="white")

# Tests for statistical significance between groups

# Mann-Whitney U test (for signifcance between groups)
test_result <- pairwise.wilcox.test(guest_lists_no_mps$user_ideal_point,guest_lists_no_mps$Show,
                                    p.adjust.method = "bonferroni")

p_value_matrix <- test_result$p.value

p_value_matrix_signif <- ifelse(p_value_matrix > 0.05, NA, "Signif.")

# Print matrix 
print(p_value_matrix_signif)

# Hartigan's Dip Test (for multimodality)
x <- dip.test(guest_lists_no_mps$user_ideal_point,
              simulate.p.value = FALSE)

# Iterate through the dip test results and store in readable data frame
dip_test_stats <- data.frame()

for (show in shows_ideal_point_plot_ordering){
  
  show_data <- guest_lists_no_mps %>%
    filter(Show == show)
  
  dip_stats <- dip.test(show_data$user_ideal_point,
                        simulate.p.value = FALSE)
  
  show_dip_stats_df <- data.frame(show = show,
                                  count = nrow(show_data[!is.na(user_ideal_point),]),
                                  mean_ideal = mean(show_data$user_ideal_point,na.rm = T),
                                  median_ideal= median(show_data$user_ideal_point,na.rm = T),
                                  s.d = sd(show_data$user_ideal_point,na.rm = T),
                                  min = min(show_data$user_ideal_point,na.rm = T),
                                  max = max(show_data$user_ideal_point,na.rm = T),
                                  dip_stat = dip_stats$statistic,
                                  p_value = dip_stats$p.value)
  
  dip_test_stats <- rbind(dip_test_stats,show_dip_stats_df)
}  

print(dip_test_stats)

#### ---- Time-series Checks ---- ####

# Filter out non-programme data 
guest_lists_merged <- guest_lists_merged %>% filter(Show != "Ordinary Account" & Show != "Elite Account")

# Group dates into quarters 
guest_lists_merged$Date <- dmy(guest_lists_merged$Date)

guest_lists_merged$date_interval <- factor(lubridate::quarter(guest_lists_merged$Date, type="year.quarter"))

guest_lists_merged <- guest_lists_merged %>% 
  mutate(date_interval = case_when(
    date_interval == "2022.1" ~ "Jan-Mar 22",
    date_interval == "2022.2" ~ "Apr-Jun 22",
    date_interval == "2022.3" ~ "Jul-Sept 22",
    date_interval == "2022.4" ~ "Oct-Dec 22",
    date_interval == "2023.1" ~ "Jan-Mar 23",
    date_interval == "2023.2" ~ "Apr-Jun 23",
    date_interval == "2023.3" ~ "Jul-Sept 23",
    date_interval == "2023.4" ~ "Oct-Dec 23",
    TRUE ~ date_interval
  ))

guest_lists_merged$date_interval <- factor(guest_lists_merged$date_interval,
                                           levels = c("Oct-Dec 23","Jul-Sept 23","Apr-Jun 23","Jan-Mar 23",
                                                      "Oct-Dec 22","Jul-Sept 22","Apr-Jun 22","Jan-Mar 22"))
                                             
# Create an "Overall" group by copying the overall data and binding back to original dataframe
guest_lists_merged_total <- guest_lists_merged
guest_lists_merged_total$Show <- "Overall"

guest_lists_merged <- rbind(guest_lists_merged,guest_lists_merged_total)
 
# Plot
time_series_ideal_plot <- ggplot(guest_lists_merged,aes(x=user_ideal_point,y=date_interval)) +
  geom_density_ridges(
    scale = 1, 
    alpha = 0.7,
    bandwidth = 0.03, 
    rel_min_height = 0.01) + 
  geom_boxplot(
    width = .3, 
    outlier.shape = NA, 
    alpha = 0.7
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 1, 0.25),
                     labels = c("Left", "0.25", "0.5", "0.75", "Right"),
                     limits = c(0, 1)) +
  labs(y = "", x = "Ideal Point") +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 6),
        axis.text.y = element_text(size = 8)) +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "black") +
  facet_wrap(~Show, scales = "fixed",ncol = 4)

ggsave("ideal_point_time_series.png",
       time_series_ideal_plot, 
       units="in", width=7, height=4, dpi=300,
       bg="white")

# Tests for statistical significance between time points

# Median points
time_median_test <- guest_lists_merged %>%
  group_by(date_interval,Show) %>%
  summarise(count = n(),
            mean_ideal = mean(user_ideal_point,na.rm=T),
            median_ideal = median(user_ideal_point,na.rm=T))

# Mann-Whitney U Test  (across time between all shows combined)
guest_lists_merged_time <- guest_lists_merged[Show == "Overall",]

pairwise.wilcox.test(guest_lists_merged_time$user_ideal_point,guest_lists_merged_time$date_interval,
                     p.adjust.method = "bonferroni")

#### ---- END ---- ####
